<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="htmlparser-conf.xsl"?>

<!-- Do not modify this file directly.  Instead, copy entries that you -->
<!-- wish to modify from this file into htmlparser-site.xml and change them -->
<!-- there.  If htmlparser-site.xml does not already exist, create it.      -->

<htmlparser-conf>

<!-- HTTP properties -->

<property>
  <name>http.agent.name</name>
  <value>WinistaBot</value>
  <description>Our HTTP 'User-Agent' request header.</description>
</property>

<property>
  <name>http.robots.agents</name>
  <value>WinistaBot,Winista,*</value>
  <description>The agent strings we'll look for in robots.txt files,
  comma-separated, in decreasing order of precedence.</description>
</property>

<property>
  <name>http.robots.403.allow</name>
  <value>true</value>
  <description>Some servers return HTTP status 403 (Forbidden) if
  /robots.txt doesn't exist. This should probably mean that we are
  allowed to crawl the site nonetheless. If this is set to false,
  then such sites will be treated as forbidden.</description>
</property>

<property>
  <name>http.agent.description</name>
  <value>Winista</value>
  <description>Further description of our bot- this text is used in
  the User-Agent header.  It appears in parenthesis after the agent name.
  </description>
</property>

<property>
  <name>http.agent.url</name>
  <value>http://www.netomatix.com/Winista/bot.html</value>
  <description>A URL to advertise in the User-Agent header.  This will 
   appear in parenthesis after the agent name.
  </description>
</property>

<property>
  <name>http.agent.email</name>
  <value>support@netomatix.com</value>
  <description>An email address to advertise in the HTTP 'From' request
   header and User-Agent header.</description>
</property>

<property>
  <name>http.agent.version</name>
  <value>1.6.2-dev</value>
  <description>A version string to advertise in the User-Agent 
   header.</description>
</property>

<property>
  <name>http.timeout</name>
  <value>10000</value>
  <description>The default network timeout, in milliseconds.</description>
</property>

<property>
  <name>http.max.delays</name>
  <value>3</value>
  <description>The number of times a thread will delay when trying to
  fetch a page.  Each time it finds that a host is busy, it will wait
  fetcher.server.delay.  After http.max.delays attepts, it will give
  up on the page for now.</description>
</property>

<property>
  <name>http.content.limit</name>
  <value>65536</value>
  <description>The length limit for downloaded content, in bytes.
  If this value is nonnegative (>=0), content longer than it will be truncated;
  otherwise, no truncation at all.
  </description>
</property>

<property>
  <name>http.proxy.host</name>
  <value></value>
  <description>The proxy hostname.  If empty, no proxy is used.</description>
</property>

<property>
  <name>http.proxy.port</name>
  <value></value>
  <description>The proxy port.</description>
</property>

<property>
  <name>http.verbose</name>
  <value>false</value>
  <description>If true, HTTP will log more verbosely.</description>
</property>

<property>
  <name>http.redirect.max</name>
  <value>3</value>
  <description>The maximum number of redirects the fetcher will follow when
    trying to fetch a page.</description>
</property>

<!-- web db properties -->

<property>
  <name>db.default.fetch.interval</name>
  <value>30</value>
  <description>The default number of days between re-fetches of a page.
  </description>
</property>

<property>
  <name>db.ignore.internal.links</name>
  <value>true</value>
  <description>If true, when adding new links to a page, links from
  the same host are ignored.  This is an effective way to limit the
  size of the link database, keeping the only the highest quality
  links.
  </description>
</property>

<property>
  <name>db.score.injected</name>
  <value>1.0</value>
  <description>The score of new pages added by the injector.
  </description>
</property>

<property>
  <name>db.score.link.external</name>
  <value>1.0</value>
  <description>The score factor for new pages added due to a link from
  another host relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.score.link.internal</name>
  <value>1.0</value>
  <description>The score factor for pages added due to a link from the
  same host, relative to the referencing page's score.
  </description>
</property>

<property>
  <name>db.max.outlinks.per.page</name>
  <value>100</value>
  <description>The maximum number of outlinks that we'll process for a page.
  </description>
</property>

<property>
  <name>db.max.anchor.length</name>
  <value>100</value>
  <description>The maximum number of characters permitted in an anchor.
  </description>
</property>

<property>
  <name>db.fetch.retry.max</name>
  <value>3</value>
  <description>The maximum number of times a url that has encountered
  recoverable errors is generated for fetch.</description>
</property>

<!-- fetchlist tool properties -->

<property>
  <name>fetchlist.score.by.link.count</name>
  <value>true</value>
  <description>If true, set page scores on fetchlist entries based on
  log(number of anchors), instead of using original page scores. This
  results in prioritization of pages with many incoming links.
  </description>
</property>

<!-- fetcher properties -->

<property>
  <name>fetcher.server.delay</name>
  <value>5.0</value>
  <description>The number of seconds the fetcher will delay between 
   successive requests to the same server.</description>
</property>

<property>
  <name>fetcher.threads.fetch</name>
  <value>10</value>
  <description>The number of FetcherThreads the fetcher should use.
    This is also determines the maximum number of requests that are 
    made at once (each FetcherThread handles one connection).</description>
</property>

<property>
  <name>fetcher.threads.per.host</name>
  <value>1</value>
  <description>This number is the maximum number of threads that
    should be allowed to access a host at one time.</description>
</property>

<property>
  <name>fetcher.verbose</name>
  <value>false</value>
  <description>If true, fetcher will log more verbosely.</description>
</property>

<!-- parser properties -->
<property>
  <name>parser.threads.parse</name>
  <value>10</value>
  <description>Number of ParserThreads ParseSegment should use.</description>
</property>

<!-- parser properties -->

<property>
  <name>parser.character.encoding.default</name>
  <value>windows-1252</value>
  <description>The character encoding to fall back to when no other information
  is available</description>
</property>

<property>
  <name>parser.html.impl</name>
  <value>Winista.HTMLParser</value>
  <description>HTML Parser implementation.
  </description>
</property>

<!-- query-basic plugin properties -->

<property>
  <name>query.url.boost</name>
  <value>4.0</value>
  <description> Used as a boost for url field in Lucene query.
  </description>
</property>

<property>
  <name>query.anchor.boost</name>
  <value>2.0</value>
  <description> Used as a boost for anchor field in Lucene query.
  </description>
</property>


<property>
  <name>query.title.boost</name>
  <value>1.5</value>
  <description> Used as a boost for title field in Lucene query.
  </description>
</property>

<property>
  <name>query.host.boost</name>
  <value>2.0</value>
  <description> Used as a boost for host field in Lucene query.
  </description>
</property>

<property>
  <name>query.phrase.boost</name>
  <value>1.0</value>
  <description> Used as a boost for phrase in Lucene query.
  Multiplied by boost for field phrase is matched in.
  </description>
</property>

</htmlparser-conf>
